#!/usr/bin/ksh
#
# iotop - display top disk I/O events by process.
#         Written using DTrace (Solaris 10 3/05).
#
# This is measuring disk events that have made it past system caches.
#
# $Id: iotop 8 2007-08-06 05:55:26Z brendan $
#
# USAGE:	iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename] 
#		      [-m mount_point] [-t top] [interval [count]]
#
#		iotop   	# default output, 5 second intervals
#
#		-C		# don't clear the screen
#		-D		# print delta times, elapsed, us
#		-j		# print project ID
#		-o		# print disk delta times, us
#		-P		# print %I/O (disk delta times)
#		-Z		# print zone ID
#		-d device	# instance name to snoop (eg, dad0)
#		-f filename	# full pathname of file to snoop
#		-m mount_point	# this FS only (will skip raw events)
#		-t top		# print top number only
#	eg,
#		iotop 1  	# 1 second samples
#		iotop -C	# don't clear the screen
#		iotop -P	# print %I/O (time based)
#		iotop -j	# print project IDs
#		iotop -Z 	# print zone IDs
#		iotop -t 20 	# print top 20 lines only
#		iotop -C 5 12	# print 12 x 5 second samples
# 	
# FIELDS:
#		UID		user ID
#		PID		process ID
#		PPID		parent process ID
#		PROJ		project ID
#		ZONE		zone ID
#		CMD		process command name
#		DEVICE  	device name
#		MAJ     	device major number
#		MIN     	device minor number
#		D		direction, Read or Write
#		BYTES		total size of operations, bytes
#		ELAPSED		total elapsed from request to completion, us
#		DISKTIME	total time for disk to complete request, us
#		%I/O		percent disk I/O, based on time (DISKTIME)
#		load		1 min load average
#		disk_r		total disk read Kbytes for sample
#		disk_w		total disk write Kbytes for sample
# 
# NOTE:
# * There are two different delta times reported. -D prints the
#   elapsed time from the disk request (strategy) to the disk completion
#   (iodone); -o prints the time for the disk to complete that event 
#   since its last event (time between iodones), or, the time to the
#   strategy if the disk had been idle. 
# * The %I/O value can exceed 100%. It represents how busy a process is
#   making the disks, in terms of a single disk. A value of 200% could 
#   mean 2 disks are busy at 100%, or 4 disks at 50%...
#
# SEE ALSO: iosnoop
#	    BigAdmin: DTrace, http://www.sun.com/bigadmin/content/dtrace
#	    Solaris Dynamic Tracing Guide, http://docs.sun.com
#	    DTrace Tools, http://www.brendangregg.com/dtrace.html
#
# INSPIRATION:  top(1) by William LeFebvre
#
# COPYRIGHT: Copyright (c) 2005, 2006 Brendan Gregg.
#
# CDDL HEADER START
#
#  The contents of this file are subject to the terms of the
#  Common Development and Distribution License, Version 1.0 only
#  (the "License").  You may not use this file except in compliance
#  with the License.
#
#  You can obtain a copy of the license at Docs/cddl1.txt
#  or http://www.opensolaris.org/os/licensing.
#  See the License for the specific language governing permissions
#  and limitations under the License.
#
# CDDL HEADER END
#
# KNOWN BUGS: 
# - This can print errors while running on servers with Veritas volumes.
#
# Author: Brendan Gregg  [Sydney, Australia]
#
# 15-Jul-2005	Brendan Gregg	Created this.
# 20-Apr-2006	   "      "	Last update.
#


##############################
# --- Process Arguments ---
#

### default variables
opt_device=0; opt_file=0; opt_mount=0; opt_clear=1; opt_proj=0; opt_zone=0
opt_percent=0; opt_def=1; opt_bytes=1; filter=0; device=.; filename=.; mount=.
opt_top=0; opt_elapsed=0; opt_dtime=0; interval=5; count=-1; top=0

### is_uint - succeed only if $1 is a non-empty string of decimal digits.
# On success the value is left in $uint with leading zeros removed, so that
# it is never read as an octal constant once substituted into the D program.
is_uint()
{
	case $1 in
	''|*[!0-9]*)	return 1 ;;
	esac
	uint=${1#"${1%%[!0]*}"}
	uint=${uint:-0}
	return 0
}

### usage - print the usage message on stderr.
usage()
{
	cat <<-END >&2
		USAGE: iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename]
		             [-m mount_point] [-t top] [interval [count]]
 
		                -C      # don't clear the screen
		                -D      # print delta times, elapsed, us
		                -j      # print project ID
		                -o      # print disk delta times, us
		                -P      # print %I/O (disk delta times)
		                -Z      # print zone ID
		                -d device       # instance name to snoop 
		                -f filename     # snoop this file only
		                -m mount_point  # this FS only 
		                -t top  	# print top number only
		   eg,
		        iotop         # default output, 5 second samples
		        iotop 1       # 1 second samples
		        iotop -P      # print %I/O (time based)
		        iotop -m /    # snoop events on filesystem / only
		        iotop -t 20   # print top 20 lines only
		        iotop -C 5 12 # print 12 x 5 second samples
		END
}

### bad_arg - report an invalid argument ($1 is the message), then exit.
bad_arg()
{
	echo "iotop: $1" >&2
	usage
	exit 1
}

### process options
while getopts CDd:f:hjm:oPt:Z name
do
	case $name in
	C)	opt_clear=0 ;;
	D)	opt_elapsed=1; opt_bytes=0 ;;
	d)	opt_device=1; device=$OPTARG ;;
	f)	opt_file=1; filename=$OPTARG ;;
	j)	opt_proj=1; opt_def=0 ;;
	m)	opt_mount=1; mount=$OPTARG ;;
	o)	opt_dtime=1; opt_bytes=0 ;;
	P)	opt_percent=1; opt_dtime=1; opt_bytes=0 ;;
	t)	# top becomes a D integer constant, so it must be numeric
		is_uint "$OPTARG" || bad_arg "-t top must be a number: $OPTARG"
		opt_top=1; top=$uint ;;
	Z)	opt_zone=1; opt_def=0 ;;
	h|?)	usage
		exit 1 ;;
	esac
done

shift $(( OPTIND - 1 ))

### option logic
# Positional arguments are [interval [count]]. Both become D integer
# constants, so anything that is not a plain number is rejected here rather
# than surfacing later as a dtrace compile error. A value of zero is
# ignored and the default kept, as in earlier versions.
if (( $# > 0 )); then
	is_uint "$1" || bad_arg "interval must be a number: $1"
	if (( uint > 0 )); then
		interval=$uint; shift
	fi
fi
if (( $# > 0 )); then
	is_uint "$1" || bad_arg "count must be a number: $1"
	if (( uint > 0 )); then
		count=$uint; shift
	fi
fi

# -Z wins over -j, and the disk time options (-o, -P) win over -D
if (( opt_proj && opt_zone )); then
	opt_proj=0
fi
if (( opt_elapsed && opt_dtime )); then
	opt_elapsed=0
fi

# any of -d, -f, -m switches the D program into filtering mode
if (( opt_device || opt_mount || opt_file )); then
	filter=1
fi

# capture the terminal clear sequence once; "." is a placeholder when unused
if (( opt_clear )); then
	clearstr=`clear`
else
	clearstr=.
fi



#################################
# --- Main Program, DTrace ---
#
# Run the D program below. The program text is a single-quoted shell string;
# each shell variable is spliced in by closing the quote, expanding the
# variable, and reopening the quote.
/usr/sbin/dtrace -n '
 /*
  * Command line arguments
  *
  * The shell option variables are substituted into the program text here
  * and become D inline constants. The string values are placed between
  * D double quotes.
  */
 inline int OPT_def 	= '$opt_def';
 inline int OPT_proj 	= '$opt_proj';
 inline int OPT_zone 	= '$opt_zone';
 inline int OPT_clear 	= '$opt_clear';
 inline int OPT_bytes 	= '$opt_bytes';
 inline int OPT_elapsed = '$opt_elapsed';
 inline int OPT_dtime 	= '$opt_dtime';
 inline int OPT_percent	= '$opt_percent';
 inline int OPT_device 	= '$opt_device';
 inline int OPT_mount 	= '$opt_mount';
 inline int OPT_file 	= '$opt_file';
 inline int OPT_top 	= '$opt_top';
 inline int INTERVAL 	= '$interval';
 inline int COUNTER 	= '$count';
 inline int FILTER 	= '$filter';
 inline int TOP 	= '$top';
 inline string DEVICE 	= "'$device'";
 inline string FILENAME = "'$filename'";
 inline string MOUNT 	= "'$mount'";
 inline string CLEAR 	= "'$clearstr'";
 
 #pragma D option quiet

 /*
  * Space for the dynamic variables (the start_* and pending arrays).
  * Boost the following if you get "dynamic variable drops".
  */
 #pragma D option dynvarsize=8m

 /*
  * Initialise the global state and announce that tracing has started.
  */
 dtrace:::BEGIN
 {
	/* last_event is keyed by device name; zero means "no event yet" */
	last_event[""] = 0;

	/* byte totals for the current sample */
	disk_w = 0;
	disk_r = 0;

	/* countdowns: seconds left in this sample, samples left to print */
	secs = INTERVAL;
	counts = COUNTER;

	printf("Tracing... Please wait.\n");
 }

 /*
  * Decide whether this event passes the command line filters.
  * The verdict is left in this->ok, which the later start and done
  * clauses test in their predicates.
  */
 io:genunix::start,
 io:genunix::done
 {
	/* without filters every event is traced; with them, none by default */
	this->ok = ! FILTER;

	/* a match on any one requested filter accepts the event */
	this->ok = (OPT_device == 1 && DEVICE == args[1]->dev_statname) ?
	    1 : this->ok;
	this->ok = (OPT_file == 1 && FILENAME == args[2]->fi_pathname) ?
	    1 : this->ok;
	this->ok = (OPT_mount == 1 && MOUNT == args[2]->fi_mount) ?
	    1 : this->ok;
 }

 /*
  * A request arriving at a device with nothing pending restarts the
  * last_event clock, so that idle time is not counted as disk time.
  */
 io:genunix::start
 /pending[args[1]->dev_statname] == 0/
 {
	/* disk time for this request is measured from now */
	last_event[args[1]->dev_statname] = timestamp;
 }

 /*
  * Remember who issued this request, so that the completion can be
  * attributed to the right process.
  */
 io:genunix::start
 /this->ok/
 {
	/* device and block number together key the outstanding request */
	this->dev = args[0]->b_edev;
	this->blk = args[0]->b_blkno;

	/* non-zero when the buffer is flagged as a read */
	this->reading = args[0]->b_flags & B_READ;

	/* issuer details, fetched again by the done clause */
	start_uid[this->dev, this->blk] = uid;
	start_pid[this->dev, this->blk] = pid;
	start_ppid[this->dev, this->blk] = ppid;
	start_comm[this->dev, this->blk] = execname;
	start_time[this->dev, this->blk] = timestamp;
	start_proj[this->dev, this->blk] = curpsinfo->pr_projid;
	start_zone[this->dev, this->blk] = curpsinfo->pr_zoneid;
	start_rw[this->dev, this->blk] = this->reading ? "R" : "W";

	/* add the request size to the sample total for its direction */
	disk_r += this->reading ? args[0]->b_bcount : 0;
	disk_w += this->reading ? 0 : args[0]->b_bcount;

	/* one more request outstanding on this device */
	pending[args[1]->dev_statname]++;
 }

 /*
  * Process completion
  *
  * Looks up the details saved by the start clause for this device and
  * block, releases them, and adds the chosen statistic (bytes, elapsed
  * time or disk time) to the @out aggregation that the report prints.
  */
 io:genunix::done
 /this->ok/
 {
	/* decrease disk event pending count */
	pending[args[1]->dev_statname]--;

	/*
	 * Process details
	 */

	/* fetch entry values */
	this->dev = args[0]->b_edev;
	this->blk = args[0]->b_blkno;
	this->suid = start_uid[this->dev, this->blk];
	this->spid = start_pid[this->dev, this->blk];
	this->sppid = start_ppid[this->dev, this->blk];
	this->sproj = start_proj[this->dev, this->blk];
	this->szone = start_zone[this->dev, this->blk];
	self->scomm = start_comm[this->dev, this->blk];
	this->stime = start_time[this->dev, this->blk];
	this->etime = timestamp; /* endtime */

	/*
	 * If the start of this request was never recorded (this program
	 * was started during the event, or the saved details were dropped)
	 * then stime is zero. Report no elapsed time in that case, instead
	 * of the whole timestamp value.
	 */
	this->elapsed = this->stime ? this->etime - this->stime : 0;
	self->rw = start_rw[this->dev, this->blk];

	/* disk time: since the last event on this device, if there was one */
	this->dtime = last_event[args[1]->dev_statname] == 0 ? 0 :
	    timestamp - last_event[args[1]->dev_statname];

	/* memory cleanup, assigning zero releases the saved details */
	start_uid[this->dev, this->blk]  = 0;
	start_pid[this->dev, this->blk]  = 0;
	start_ppid[this->dev, this->blk] = 0;
	start_time[this->dev, this->blk] = 0;
	start_comm[this->dev, this->blk] = 0;
	start_zone[this->dev, this->blk] = 0;
	start_proj[this->dev, this->blk] = 0;
	start_rw[this->dev, this->blk]   = 0;

	/*
	 * Choose statistic to track
	 * (the times are converted from nanoseconds to microseconds)
	 */
	OPT_bytes   ? this->value = args[0]->b_bcount    : 1;
	OPT_elapsed ? this->value = this->elapsed / 1000 : 1;
	OPT_dtime   ? this->value = this->dtime / 1000   : 1;

	/*
	 * Save details
	 * (the first key is the UID, project ID or zone ID, by option)
	 */
	OPT_def ? @out[this->suid, this->spid, this->sppid, self->scomm,
	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
	    self->rw] = sum(this->value) : 1;
	OPT_proj ? @out[this->sproj, this->spid, this->sppid, self->scomm,
	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
	    self->rw] = sum(this->value) : 1;
	OPT_zone ? @out[this->szone, this->spid, this->sppid, self->scomm,
	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
	    self->rw] = sum(this->value) : 1;

	/* save last disk event */
	last_event[args[1]->dev_statname] = timestamp;

	/* release the thread-local strings used above */
	self->scomm = 0;
	self->rw = 0;
 }

 /*
  * Prevent pending from underflowing
  * this can happen if this program is started during disk events.
  *
  * The done clause above decrements pending for each traced completion,
  * including those whose start was never counted; a negative count is
  * put back to zero here. This clause does not test this->ok.
  */
 io:genunix::done
 /pending[args[1]->dev_statname] < 0/
 {
	pending[args[1]->dev_statname] = 0;
 }

 /*
  * Once a second, count down towards the end of the current sample.
  * The report clause below fires when the countdown reaches zero.
  */
 profile:::tick-1sec
 {
	secs -= 1;
 }

 /*
  * End of sample: print the status line, the column headers and the
  * per-process totals, then reset everything for the next sample.
  */
 profile:::tick-1sec
 /secs == 0/
 {
	/* 1 min load average, split into integer part and hundredths */
	this->load_whole = `hp_avenrun[0] / 65536;
	this->load_cents = ((`hp_avenrun[0] % 65536) * 100) / 65536;

	/* the byte counters are reported in Kbytes */
	disk_r = disk_r / 1024;
	disk_w = disk_w / 1024;

	/* status line, after clearing the screen if requested */
	OPT_clear ? printf("%s", CLEAR) : 1;
	printf("%Y,  load: %d.%02d,  disk_r: %6d KB,  disk_w: %6d KB\n\n",
	    walltimestamp, this->load_whole, this->load_cents, disk_r, disk_w);

	/* first column header depends on the -j and -Z options */
	OPT_def  ? printf("  UID ") : 1;
	OPT_proj ? printf(" PROJ ") : 1;
	OPT_zone ? printf(" ZONE ") : 1;

	/* common column headers */
	printf("%6s %6s %-16s %-7s %3s %3s %1s",
	    "PID", "PPID", "CMD", "DEVICE", "MAJ", "MIN", "D");

	/* last column header names the statistic being summed */
	OPT_bytes   ? printf(" %16s\n", "BYTES") : 1;
	OPT_elapsed ? printf(" %16s\n", "ELAPSED") : 1;
	OPT_dtime && ! OPT_percent  ? printf(" %16s\n", "DISKTIME") : 1;
	OPT_dtime && OPT_percent    ? printf(" %6s\n", "%I/O") : 1;

	/* keep only the top entries when -t was given */
	OPT_top ? trunc(@out, TOP) : 1;

	/* microseconds over the whole interval become a percentage */
	OPT_percent ? normalize(@out, INTERVAL * 10000) : 1;

	/* the data itself; the percentage column is narrower */
	OPT_percent ?
	    printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %6@d\n", @out) :
	    printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %16@d\n", @out);
	printf("\n");

	/* start the next sample from scratch */
	trunc(@out);
	disk_r = 0;
	disk_w = 0;
	secs = INTERVAL;
	counts--;
 }

 /*
  * End of program
  *
  * counts starts at COUNTER and is decremented after each report, so
  * this fires once the requested number of samples has been printed.
  * With the default COUNTER of -1 the count starts below zero and this
  * clause is not expected to fire.
  */
 profile:::tick-1sec
 /counts == 0/
 {
	exit(0);
 }

 /*
  * Cleanup for Ctrl-C
  *
  * Empties @out when the program ends; presumably so that a partly
  * collected sample is not printed on exit (confirm against the
  * DTrace aggregation documentation).
  */
 dtrace:::END
 {
	trunc(@out);
 }
'
